# 20180609
# author：葛木瓜
# 实现从西安物价局官网抓取房价包

import sys
sys.path.append('./')
from bs4 import BeautifulSoup
from urllib import request
from selenium import webdriver
from byCsv import *
import os
import re


class GetPriceZip:

    """
    Scrape house-price archive links from the Xi'an Price Bureau website.

    Every instance starts its own PhantomJS browser, loads one page,
    optionally switches into one of the page's iframes and parses the
    resulting HTML with BeautifulSoup (available as ``self.soup``).

    The browser is an external process: call :meth:`close` when done, or use
    the instance as a context manager (``with GetPriceZip(...) as page:``).
    """

    def __init__(self, url, flag, executable_path='D:\\Python36\\phantomjs.exe'):
        """
        Load ``url`` in PhantomJS and parse the selected frame.

        :param url: address of the page to load
        :param flag: 0 switches into the 'iframecenter' frame, 1 switches into
            the 'showconent1' frame; any other value parses the top-level
            document
        :param executable_path: location of the PhantomJS binary; defaults to
            the path that used to be hard-coded here
        :return:
        """
        self.cur_path = os.path.dirname(__file__)
        # Relative path, i.e. resolved against the current working directory.
        self.csv_fp = 'csvData/newest_trid.csv'
        self.url = url
        self.executable_path = executable_path
        # NOTE(review): PhantomJS support is deprecated in newer Selenium
        # releases - confirm the pinned version still ships webdriver.PhantomJS.
        self.driver = webdriver.PhantomJS(executable_path=executable_path)
        try:
            self.driver.get(self.url)
            if flag == 0:
                self.driver.switch_to.frame('iframecenter')
            elif flag == 1:
                self.driver.switch_to.frame('showconent1')
            self.soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        except Exception:
            # Do not leave an orphaned browser process behind when loading or
            # frame switching fails.
            self.driver.quit()
            raise

    def __enter__(self):
        """Return the instance itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the ``with`` block; never swallows errors."""
        self.close()
        return False

    def close(self):
        """
        Quit the PhantomJS browser owned by this instance.

        Safe to call more than once.
        :return:
        """
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def get_first_level_url(self):

        """
        Collect the first-level page URLs that are newer than the last run.

        Links whose ``href`` contains 'trid' are read from the parsed page in
        document order; the trid is the text after the last '=' of the href.
        The id stored in ``self.csv_fp`` is compared against them and
        collection stops at the first link that is not newer.  Whenever new
        links are found - or the CSV does not exist yet - the first link's
        text and trid are recorded through ``write_csv``.

        :return: list of URLs for the new entries; empty when the page has no
            trid links, when nothing is newer, or on the very first run (the
            CSV is only initialised then)
        """
        base_url = 'http://wjj.xa.gov.cn/ptl/def/def/index_1285_3892_ci_trid_'
        links = self.soup.find_all(href=re.compile('trid'))
        if not links:
            # Nothing to compare or record; previously this crashed with an
            # IndexError when the CSV was missing.
            return []

        newest_link = links[0]
        # Use the same trid extraction ([-1]) as the comparison loop below.
        newest_record = [newest_link.text.strip(), newest_link['href'].split('=')[-1]]

        if not os.path.exists(self.csv_fp):
            write_csv(self.csv_fp, newest_record)
            return []

        # presumably read_csv(path, 1) returns the trid column and the last
        # element is the most recently recorded id - verify against byCsv.
        newest_trid = int(read_csv(self.csv_fp, 1)[-1])
        first_level_url_lst = []
        for link in links:
            trid = int(link['href'].split('=')[-1])
            if trid <= newest_trid:
                break
            first_level_url_lst.append(base_url + str(trid) + '.html')
        if first_level_url_lst:
            write_csv(self.csv_fp, newest_record)
        return first_level_url_lst

    def get_prize_zip(self):

        """
        Print the '.rar' archive links found on every new first-level page.

        The index page (``self.url``) is loaded in a fresh browser with
        ``flag=0`` to obtain the new first-level URLs; each of those pages is
        then loaded with ``flag=1`` and searched for links whose ``href``
        contains '.rar'.  Every temporary browser is closed as soon as its
        page has been parsed.

        :return:
        """
        with GetPriceZip(self.url, flag=0, executable_path=self.executable_path) as index_page:
            first_level_lst = index_page.get_first_level_url()

        for url_i in first_level_lst:
            with GetPriceZip(url_i, flag=1, executable_path=self.executable_path) as detail_page:
                # Search the detail page itself (not self.soup) and match a
                # literal dot before 'rar'.
                download_url = detail_page.soup.find_all(href=re.compile(r'\.rar'))
            print(download_url)
            # TODO: download each archive (e.g. urllib.request.urlretrieve)
            # into a local directory; for now the links are only printed.



if __name__ == '__main__':

    # Start page; flag=0 makes the scraper read its 'iframecenter' frame.
    url_ = 'http://wjj.xa.gov.cn/ptl/def/def/index_1285_3887_ci_trid_4416419.html'
    scraper = GetPriceZip(url_, flag=0)
    try:
        scraper.get_prize_zip()
    finally:
        # Always terminate the PhantomJS process, even when scraping fails.
        scraper.driver.quit()
    # Example URLs from the site, kept for reference:
    #   .rar archive:  http://wjj.xa.gov.cn/attached/file/20180912/20180912181522_836_2955288.rar
    #   trid link:     http://wjj.xa.gov.cn/ptl/def/def/index_1285_3892.jsp?trid=2957936



